library(dataQualityR) # data-quality report (checkDataQuality)
library(e1071) # SVM training and testing models
library(mice) # missing-data imputation
library(corrplot) # correlation-matrix plots
library(ggplot2) # visualization and plots
library(ggpubr) # arranging ggplot2 plots on one page (ggarrange)
library(scales) # scale helpers mapping data to aesthetics
library(caret) # Classification And REgression Training (train, trainControl, varImp)
library(dplyr) # data manipulation: filter and arrange
library(tidyverse) # data manipulation
library(sf) # spatial data / map plotting
library(gganimate) # animated ggplot2 visualizations
library(MASS) # statistical functions and example data sets; NOTE(review): attached after dplyr, so MASS::select masks dplyr::select
library(VIM) # visualization of missing or imputed values (aggr)
library(glmnet) # Lasso and Elastic-Net regularization
Ce projet vise à déterminer quelles caractéristiques sont les meilleurs indicateurs de la qualité du vin rouge et à donner un aperçu de l'influence de chacun de ces facteurs sur la qualité prédite par notre modèle. Par la suite, ce modèle pourra aider les entreprises du secteur à prédire la qualité de leur vin.
# Load the red-wine quality data and inspect it: dimensions, column types and
# per-column summary statistics. Lines starting with "##" are the console
# output recorded when the script was run.
df<-read.csv("winequality-red.csv")
dim(df)
## [1] 1599 12
# Every predictor is numeric; quality (the response) is an integer score.
str(df)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
summary(df)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Write the dataQualityR reports (one file for numeric columns, one for
# categorical columns) into the current working directory.
num.file <- file.path(getwd(), "dqames_num.csv")
cat.file <- file.path(getwd(), "dqames_cat.csv")
checkDataQuality(
  data = df,
  out.file.num = num.file,
  out.file.cat = cat.file
)
## Check for numeric variables completed // Results saved to disk // Time difference of 0.02191997 secs
## // Time difference of 0.000003099442 secs
# Missing-value overview: share of missing values per variable and the
# combinations in which they occur, with variables sorted by missing count.
aggr_plot <- aggr(
  df,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(df),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
##
## Variables sorted by number of missings:
## Variable Count
## fixed.acidity 0
## volatile.acidity 0
## citric.acid 0
## residual.sugar 0
## chlorides 0
## free.sulfur.dioxide 0
## total.sulfur.dioxide 0
## density 0
## pH 0
## sulphates 0
## alcohol 0
## quality 0
set.seed(123)
library(dplyr)
library(tidyr)
# Print each distinct correlation value once, strongest first, together with
# one pair of variables that has that value. Grouping on the value collapses
# the two mirrored entries of the symmetric matrix (and the whole diagonal).
# NOTE(review): gather() is superseded by pivot_longer(); it is kept because
# its column-major row order decides which of the two mirrored pairs survives.
local({
  cor_wide <- as.data.frame(cor(df))
  cor_wide <- mutate(cor_wide, var1 = rownames(cor_wide))
  cor_long <- gather(cor_wide, var2, value, -var1)
  cor_sorted <- arrange(cor_long, desc(value))
  filter(group_by(cor_sorted, value), row_number() == 1)
})
## # A tibble: 67 × 3
## # Groups: value [67]
## var1 var2 value
## <chr> <chr> <dbl>
## 1 fixed.acidity fixed.acidity 1
## 2 citric.acid fixed.acidity 0.672
## 3 density fixed.acidity 0.668
## 4 total.sulfur.dioxide free.sulfur.dioxide 0.668
## 5 quality alcohol 0.476
## 6 sulphates chlorides 0.371
## 7 density citric.acid 0.365
## 8 density residual.sugar 0.355
## 9 sulphates citric.acid 0.313
## 10 quality sulphates 0.251
## # … with 57 more rows
# Correlation matrix of all columns, drawn as a coloured grid with the
# coefficient printed in every cell; the diagonal is hidden.
dfcor <- cor(df)
corrplot(dfcor, method = "color", addCoef.col = "black", number.cex = .6,
         tl.col = "black", tl.srt = 90, diag = FALSE)
# Absolute correlation of every column with quality, strongest first.
# The matrix computed above is reused, and the column is selected by name so
# the result does not depend on quality being the 12th column.
quality_cor <- dfcor[, "quality"]
absoutcome_cor <- abs(quality_cor)
head(absoutcome_cor[order(absoutcome_cor, decreasing = TRUE)], 12)
## quality alcohol volatile.acidity
## 1.00000000 0.47616632 0.39055778
## sulphates citric.acid total.sulfur.dioxide
## 0.25139708 0.22637251 0.18510029
## density chlorides fixed.acidity
## 0.17491923 0.12890656 0.12405165
## pH free.sulfur.dioxide residual.sugar
## 0.05773139 0.05065606 0.01373164
# Histogram of the quality scores with the sample mean marked by a dashed line.
# - quality is an integer score, so one bin per score is used instead of the
#   default 30 bins.
# - The mean line and its label are single annotations: they are drawn once
#   rather than once per data row, and the invalid `text = element_text()`
#   argument of the former geom_text() call is gone.
q1 <- ggplot(df, aes(quality)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Histogram of quality") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_vline(xintercept = mean(df$quality), color = "blue",
             linetype = "dashed", size = 1) +
  annotate("text", x = 5.6, y = 400, label = "Mean Value",
           colour = "red", angle = 90, vjust = 1.2)
q1
# Normal QQ plot of quality: sample quantiles against theoretical normal
# quantiles, with the reference line in red.
# The y axis shows the quality scores themselves; the former label function
# divided every tick by 10^6, which mislabelled the axis.
q2 <- ggplot(df, aes(sample = quality)) +
  stat_qq(color = "dodgerblue4") +
  stat_qq_line(color = "red") +
  labs(title = "QQ Plot for quality", y = "Ordered Values") +
  theme(plot.title = element_text(hjust = 0.5))
q2
# Density curve of one numeric column of `df`.
# var: column name as a string; it is also used as the x-axis label.
density_plot <- function(var) {
  ggplot(df, aes(x = .data[[var]])) +
    geom_density() +
    labs(x = var)
}

# Dashed blue vertical line at the mean of column `var` of `df`.
mean_line <- function(var) {
  geom_vline(xintercept = mean(df[[var]]), color = "blue",
             linetype = "dashed", size = 1)
}

# One density plot per predictor. Each plot is printed once with its mean
# line; the stored objects p1..p11 hold the bare density curves.
p1 <- density_plot("fixed.acidity")
p1 + mean_line("fixed.acidity")
p2 <- density_plot("volatile.acidity")
p2 + mean_line("volatile.acidity")
p3 <- density_plot("citric.acid")
p3 + mean_line("citric.acid")
p4 <- density_plot("residual.sugar")
p4 + mean_line("residual.sugar")
p5 <- density_plot("chlorides")
p5 + mean_line("chlorides")
p6 <- density_plot("free.sulfur.dioxide")
p6 + mean_line("free.sulfur.dioxide")
p7 <- density_plot("total.sulfur.dioxide")
p7 + mean_line("total.sulfur.dioxide")
p8 <- density_plot("density")
p8 + mean_line("density")
p9 <- density_plot("pH")
p9 + mean_line("pH")
p10 <- density_plot("sulphates")
p10 + mean_line("sulphates")
p11 <- density_plot("alcohol")
p11 + mean_line("alcohol")
# NOTE(review): the grids below use the stored plots, so they do not show the
# mean lines; store `density_plot(v) + mean_line(v)` if they should.
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
ggarrange(p5, p6, p7, p8, nrow = 2, ncol = 2)
ggarrange(p9, p10, p11, nrow = 2, ncol = 2)
# Base-graphics box plot of one predictor, with filled points for the
# outliers. Returns whatever boxplot() returns so it can be stored.
draw_boxplot <- function(values) {
  boxplot(values, col = "slategray2", pch = 19)
}

# One box plot per predictor; the returned objects are kept in b1..b11.
b1 <- draw_boxplot(df$fixed.acidity)
b2 <- draw_boxplot(df$volatile.acidity)
b3 <- draw_boxplot(df$citric.acid)
b4 <- draw_boxplot(df$residual.sugar)
b5 <- draw_boxplot(df$chlorides)
b6 <- draw_boxplot(df$free.sulfur.dioxide)
b7 <- draw_boxplot(df$total.sulfur.dioxide)
b8 <- draw_boxplot(df$density)
b9 <- draw_boxplot(df$pH)
b10 <- draw_boxplot(df$sulphates)
b11 <- draw_boxplot(df$alcohol)
# Box plot of one predictor for each quality score.
# var:   predictor column name (string); also used as the y-axis label.
# title: plot title.
# x_lab: x-axis label.
quality_boxplot <- function(var, title, x_lab = "quality") {
  ggplot(df, aes(factor(quality), .data[[var]], fill = factor(quality))) +
    geom_boxplot() +
    labs(x = x_lab, y = var, title = title) +
    theme(legend.position = "none",
          plot.title = element_text(size = 9, hjust = 0.5))
}

g1 <- quality_boxplot("fixed.acidity", "Boxplot of Quality vs. fixed.acidity")
g1
g2 <- quality_boxplot("volatile.acidity",
                      "Boxplot of Quality vs. volatile.acidity")
g2
g3 <- quality_boxplot("citric.acid", "Boxplot of Quality vs. citric.acid")
g3
g4 <- quality_boxplot("residual.sugar", "Boxplot of Quality vs. residual.sugar")
g4
ggarrange(g1, g2, g3, g4, nrow = 2, ncol = 2)
# There appears to be a positive relationship between citric acid and quality.
# There appears to be a negative relationship between volatile acidity and quality.
g5 <- quality_boxplot("chlorides", "Boxplot of Quality vs. chlorides",
                      x_lab = "Quality")
g5
g6 <- quality_boxplot("free.sulfur.dioxide",
                      "Boxplot of quality vs. free.sulfur.dioxide")
g6
g7 <- quality_boxplot("total.sulfur.dioxide",
                      "Boxplot of quality vs. total.sulfur.dioxide")
g7
g8 <- quality_boxplot("density", "Boxplot of quality vs. density")
g8
ggarrange(g5, g6, g7, g8, nrow = 2, ncol = 2)
# There appears to be a negative relationship between density and quality.
g9 <- quality_boxplot("pH", "Boxplot of Quality vs. pH")
g9
g10 <- quality_boxplot("sulphates", "Boxplot of quality vs. sulphates")
g10
g11 <- quality_boxplot("alcohol", "Boxplot of quality vs. alcohol")
g11
ggarrange(g9, g10, g11, nrow = 2, ncol = 2)
# There appears to be a positive relationship between alcohol and quality.
# There appears to be a positive relationship between sulphates and quality.
# Scatter plot of two columns of `df` with a fitted straight line (lm) in red.
# x_var, y_var: column names as strings; they label the axes and form the
# title "<x_var> vs. <y_var>".
scatter_with_fit <- function(x_var, y_var) {
  ggplot(df, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point(color = "dodgerblue4", size = 0.7) +
    labs(title = paste(x_var, "vs.", y_var), x = x_var, y = y_var) +
    geom_smooth(formula = y ~ x, method = lm, color = "red") +
    theme(plot.title = element_text(hjust = 0.5))
}

s1 <- scatter_with_fit("fixed.acidity", "citric.acid")
s1
s2 <- scatter_with_fit("fixed.acidity", "density")
s2
s3 <- scatter_with_fit("free.sulfur.dioxide", "total.sulfur.dioxide")
s3
s4 <- scatter_with_fit("fixed.acidity", "pH")
s4
ggarrange(s1, s2, s3, s4, nrow = 2, ncol = 2)
# There appears to be a positive relationship between fixed.acidity and citric.acid.
# There appears to be a positive relationship between fixed.acidity and density.
# There appears to be a positive relationship between free.sulfur.dioxide and total.sulfur.dioxide.
# There appears to be a negative relationship between fixed.acidity and pH.
# Box plots of a second predictor against alcohol rounded to the nearest
# whole number, with one box per quality score inside each alcohol level.
i1 <- ggplot(df, aes(x = factor(round(alcohol)), y = citric.acid)) +
  geom_boxplot(aes(colour = factor(quality))) +
  labs(title = "Alcohol + Citric.Acid vs. Quality") +
  theme(plot.title = element_text(hjust = 0.5))
i1
i2 <- ggplot(df, aes(x = factor(round(alcohol)), y = volatile.acidity)) +
  geom_boxplot(aes(colour = factor(quality))) +
  labs(title = "Alcohol + Volatile.Acidity vs. Quality") +
  theme(plot.title = element_text(hjust = 0.5))
i2
# ylim() and theme() are now joined with `+`. Previously the chain ended at
# ylim(), so i3 had no centred title and the theme object was printed to the
# console as a separate statement.
# NOTE(review): ylim() drops observations outside [0, 0.3] before the box
# statistics are computed; coord_cartesian(ylim = ...) would only zoom.
i3 <- ggplot(df, aes(x = factor(round(alcohol)), y = chlorides)) +
  geom_boxplot(aes(colour = factor(quality))) +
  labs(title = "Alcohol + Chlorides vs. Quality") +
  ylim(0, 0.3) +
  theme(plot.title = element_text(hjust = 0.5))
i3
# Baseline linear model: quality regressed on the five predictors with the
# highest absolute correlation with quality.
lm0 <- lm(quality ~ alcohol + volatile.acidity + sulphates + citric.acid + total.sulfur.dioxide, data = df)
summary(lm0)
##
## Call:
## lm(formula = quality ~ alcohol + volatile.acidity + sulphates +
## citric.acid + total.sulfur.dioxide, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.72463 -0.38380 -0.06689 0.44606 2.14550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8431068 0.2050732 13.864 < 0.0000000000000002 ***
## alcohol 0.2953419 0.0160375 18.416 < 0.0000000000000002 ***
## volatile.acidity -1.2223102 0.1124774 -10.867 < 0.0000000000000002 ***
## sulphates 0.7207881 0.1027039 7.018 0.00000000000332 ***
## citric.acid -0.0427246 0.1035810 -0.412 0.68
## total.sulfur.dioxide -0.0022182 0.0005126 -4.327 0.00001602753699 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6552 on 1593 degrees of freedom
## Multiple R-squared: 0.3439, Adjusted R-squared: 0.3418
## F-statistic: 167 on 5 and 1593 DF, p-value: < 0.00000000000000022
# Reduced linear model: same predictors as lm0 without citric.acid, whose
# coefficient was not significant there (p = 0.68 in the recorded summary).
lm1 <- lm(quality ~ alcohol + volatile.acidity + sulphates + total.sulfur.dioxide, data = df)
summary(lm1)
##
## Call:
## lm(formula = quality ~ alcohol + volatile.acidity + sulphates +
## total.sulfur.dioxide, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.72716 -0.38486 -0.06503 0.44980 2.13257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8258128 0.2006892 14.081 < 0.0000000000000002 ***
## alcohol 0.2953105 0.0160331 18.419 < 0.0000000000000002 ***
## volatile.acidity -1.1985632 0.0966011 -12.407 < 0.0000000000000002 ***
## sulphates 0.7121396 0.1005146 7.085 0.00000000000208 ***
## total.sulfur.dioxide -0.0022354 0.0005108 -4.376 0.00001284518270 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.655 on 1594 degrees of freedom
## Multiple R-squared: 0.3438, Adjusted R-squared: 0.3421
## F-statistic: 208.8 on 4 and 1594 DF, p-value: < 0.00000000000000022
# 10-fold cross-validation of the four-predictor linear model.
# The seed is fixed so that the fold assignment is repeatable.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
model1 <- train(
  quality ~ alcohol + volatile.acidity + sulphates + total.sulfur.dioxide,
  data = df,
  method = "lm",
  trControl = train.control
)
# Coefficients of the model fitted by train().
summary(model1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.72716 -0.38486 -0.06503 0.44980 2.13257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8258128 0.2006892 14.081 < 0.0000000000000002 ***
## alcohol 0.2953105 0.0160331 18.419 < 0.0000000000000002 ***
## volatile.acidity -1.1985632 0.0966011 -12.407 < 0.0000000000000002 ***
## sulphates 0.7121396 0.1005146 7.085 0.00000000000208 ***
## total.sulfur.dioxide -0.0022354 0.0005108 -4.376 0.00001284518270 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.655 on 1594 degrees of freedom
## Multiple R-squared: 0.3438, Adjusted R-squared: 0.3421
## F-statistic: 208.8 on 4 and 1594 DF, p-value: < 0.00000000000000022
# Cross-validated RMSE, R-squared and MAE.
print(model1)
## Linear Regression
##
## 1599 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1439, 1439, 1438, 1439, 1439, 1440, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6560876 0.3429191 0.5099489
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
library(glmnet)
# Lasso (alpha = 1) with cross-validated lambda, fitted on all predictors.
# model.matrix() already returns a numeric matrix; its intercept column is
# dropped because glmnet adds its own intercept.
# NOTE(review): no seed is set directly before cv.glmnet(), so the folds (and
# the selected lambdas) depend on the RNG state left by the preceding code.
x <- model.matrix(quality ~ ., df)[, -1]
y <- df$quality
mod <- cv.glmnet(x, y, alpha = 1)
# Coefficients at the lambda with the lowest cross-validated error.
as.matrix(coef(mod, s = mod$lambda.min))
## s1
## (Intercept) 18.731818444
## fixed.acidity 0.020774482
## volatile.acidity -1.078204434
## citric.acid -0.164074955
## residual.sugar 0.014319348
## chlorides -1.867641097
## free.sulfur.dioxide 0.004233406
## total.sulfur.dioxide -0.003228361
## density -14.593604735
## pH -0.422494118
## sulphates 0.906073568
## alcohol 0.278563263
# Coefficients at lambda.1se, kept in CF for the selection step below.
CF <- as.matrix(coef(mod, s = mod$lambda.1se))
CF
## s1
## (Intercept) 3.1453849170
## fixed.acidity 0.0018155594
## volatile.acidity -1.0221755767
## citric.acid 0.0000000000
## residual.sugar 0.0000000000
## chlorides -0.2350727376
## free.sulfur.dioxide 0.0000000000
## total.sulfur.dioxide -0.0009327824
## density 0.0000000000
## pH 0.0000000000
## sulphates 0.4961365592
## alcohol 0.2640733125
# Terms the lasso kept, i.e. those with a non-zero coefficient.
CF[CF[, 1] != 0, ]
## (Intercept) fixed.acidity volatile.acidity
## 3.1453849170 0.0018155594 -1.0221755767
## chlorides total.sulfur.dioxide sulphates
## -0.2350727376 -0.0009327824 0.4961365592
## alcohol
## 0.2640733125
# Linear model on the six predictors that kept a non-zero lasso coefficient
# at lambda.1se.
lm2 <- lm(quality ~ fixed.acidity + volatile.acidity + chlorides + total.sulfur.dioxide + sulphates + alcohol, data=df)
summary(lm2)
##
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + chlorides +
## total.sulfur.dioxide + sulphates + alcohol, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.70812 -0.37181 -0.06238 0.45933 1.99472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7365412 0.2325021 11.770 < 0.0000000000000002 ***
## fixed.acidity 0.0236576 0.0099187 2.385 0.0172 *
## volatile.acidity -1.0856214 0.0996323 -10.896 < 0.0000000000000002 ***
## chlorides -1.7376885 0.3913566 -4.440 0.00000960779597327 ***
## total.sulfur.dioxide -0.0021460 0.0005121 -4.191 0.00002933553690691 ***
## sulphates 0.8846921 0.1108310 7.982 0.00000000000000272 ***
## alcohol 0.2825603 0.0166180 17.003 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6504 on 1592 degrees of freedom
## Multiple R-squared: 0.3538, Adjusted R-squared: 0.3514
## F-statistic: 145.3 on 6 and 1592 DF, p-value: < 0.00000000000000022
# 10-fold cross-validation of the six-predictor (lasso-selected) linear model.
# The seed is fixed so that the fold assignment is repeatable.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
model2 <- train(
  quality ~ fixed.acidity + volatile.acidity + chlorides +
    total.sulfur.dioxide + sulphates + alcohol,
  data = df,
  method = "lm",
  trControl = train.control
)
# Coefficients of the model fitted by train().
summary(model2)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.70812 -0.37181 -0.06238 0.45933 1.99472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7365412 0.2325021 11.770 < 0.0000000000000002 ***
## fixed.acidity 0.0236576 0.0099187 2.385 0.0172 *
## volatile.acidity -1.0856214 0.0996323 -10.896 < 0.0000000000000002 ***
## chlorides -1.7376885 0.3913566 -4.440 0.00000960779597327 ***
## total.sulfur.dioxide -0.0021460 0.0005121 -4.191 0.00002933553690691 ***
## sulphates 0.8846921 0.1108310 7.982 0.00000000000000272 ***
## alcohol 0.2825603 0.0166180 17.003 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6504 on 1592 degrees of freedom
## Multiple R-squared: 0.3538, Adjusted R-squared: 0.3514
## F-statistic: 145.3 on 6 and 1592 DF, p-value: < 0.00000000000000022
# Cross-validated RMSE, R-squared and MAE.
print(model2)
## Linear Regression
##
## 1599 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1439, 1439, 1438, 1439, 1439, 1440, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.6525392 0.3496559 0.50698
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
library(randomForest)
library(mlbench)
library(caret) # already attached at the top; createDataPartition() is not used here
# 70/30 train/test split by simple random sampling of row indices.
# Only one seed is set: the former set.seed(95014) was immediately overridden
# by set.seed(123) and had no effect on the split.
set.seed(123)
n <- nrow(df)
trainIndex <- sample(seq_len(n), size = round(0.7 * n), replace = FALSE)
# Training and test data frames built from the sampled rows.
training <- df[trainIndex, ]
testing <- df[-trainIndex, ]
# Regression random forest on all predictors, trying 3 variables per split
# and recording variable importance.
model3 <- randomForest(quality ~ ., training, mtry = 3,
                       importance = TRUE, na.action = na.omit)
print(model3)
##
## Call:
## randomForest(formula = quality ~ ., data = training, mtry = 3, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.3669176
## % Var explained: 46.23
# Error as a function of the number of trees.
plot(model3)
# Variable importance of the fitted forest.
varImp(model3)
## Overall
## fixed.acidity 20.71249
## volatile.acidity 29.63074
## citric.acid 21.53189
## residual.sugar 14.61027
## chlorides 22.20528
## free.sulfur.dioxide 20.00519
## total.sulfur.dioxide 28.35124
## density 28.98152
## pH 20.88894
## sulphates 44.31574
## alcohol 49.35817
varImpPlot(model3, type = 2)
# MSE after the final tree: the last element of model3$mse, which should match
# the "Mean of squared residuals" reported by print(model3).
final_mse <- tail(model3$mse, 1)
final_mse
## [1] 0.3669176
# Its square root is the model's RMSE.
sqrt(final_mse)
## [1] 0.6057372
# Now compute the error on the held-out test rows instead of the training data.
predValues <- predict(model3, testing)
# Both metrics are computed directly from the prediction errors.
sqrt(mean((testing$quality - predValues)^2)) #RMSE
## [1] 0.5506011
mean(abs(testing$quality - predValues)) #MAE
## [1] 0.4104835
Evaluation dataframe
# Comparison table of the three models, built from the fitted objects rather
# than from hand-copied numbers, which no longer matched the recorded results.
# Models 1 and 2: 10-fold cross-validated metrics stored by caret::train().
# Model 3: metrics of the random-forest predictions on the held-out test rows.
# NOTE(review): the first two rows are cross-validation estimates and the third
# is a single hold-out estimate, so they are not strictly comparable.
rf_test <- postResample(pred = predValues, obs = testing$quality)
Model <- c("Model 1", "Model 2", "Model 3")
R_squared <- c(model1$results$Rsquared, model2$results$Rsquared,
               rf_test[["Rsquared"]])
RMSE <- c(model1$results$RMSE, model2$results$RMSE, rf_test[["RMSE"]])
MAE <- c(model1$results$MAE, model2$results$MAE, rf_test[["MAE"]])
ml <- data.frame(Model, R_squared, RMSE, MAE)
Plot
library(gridExtra)
# Dot plot of one metric column of `ml`, one coloured point per model.
# metric: column name in `ml` (string); title: plot title.
# The point size is a fixed setting, so it sits outside aes(); inside aes() the
# constant 4 was treated as data and passed through the size scale.
metric_plot <- function(metric, title) {
  ggplot(ml, aes(Model, .data[[metric]])) +
    geom_point(aes(colour = factor(Model)), size = 4) +
    labs(title = title) +
    theme(plot.title = element_text(hjust = 0.5),
          axis.title.y = element_blank(),
          axis.title.x = element_blank(),
          legend.position = "none")
}

p1 <- metric_plot("RMSE", "RMSE")
p2 <- metric_plot("R_squared", "R-Squared")
p3 <- metric_plot("MAE", "MAE")
# Side by side: R-squared, RMSE, MAE.
grid.arrange(p2, p1, p3, ncol = 3)